/******************************************************************************
 * arch/x86/pv/emul-priv-op.c
 *
 * Emulate privileged instructions for PV guests
 *
 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/errno.h>
#include <xen/event.h>
#include <xen/guest_access.h>
#include <xen/iocap.h>
#include <xen/spinlock.h>
#include <xen/trace.h>

#include <asm/amd.h>
#include <asm/debugreg.h>
#include <asm/hypercall.h>
#include <asm/mc146818rtc.h>
#include <asm/p2m.h>
#include <asm/pv/domain.h>
#include <asm/pv/traps.h>
#include <asm/shared.h>
#include <asm/traps.h>
#include <asm/x86_emulate.h>

#include <xsm/xsm.h>

#include "../x86_64/mmconfig.h"
#include "emulate.h"
#include "mm.h"

struct priv_op_ctxt {
    struct x86_emulate_ctxt ctxt;
    struct {
        unsigned long base, limit;
    } cs;
    char *io_emul_stub;
    unsigned int bpmatch;
};

/* I/O emulation support. Helper routines for, and type of, the stack stub. */
void host_to_guest_gpr_switch(struct cpu_user_regs *);
unsigned long guest_to_host_gpr_switch(unsigned long);

typedef void io_emul_stub_t(struct cpu_user_regs *);
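
/*
 * In the common (non-quirk) case, io_emul_stub_setup() below assembles a
 * 9-byte stub:
 *
 *   e8 <rel32>      call host_to_guest_gpr_switch
 *   66 or 90        operand-size prefix for 2-byte accesses, else nop
 *   <opcode>        the IN/OUT opcode being emulated
 *   <port> or 90    imm8 port for the 0xe4-0xe7 forms, else nop
 *   c3              ret (jumps to guest_to_host_gpr_switch)
 */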
static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
                                          unsigned int port, unsigned int bytes)
{
    struct stubs *this_stubs = &this_cpu(stubs);
    unsigned long stub_va = this_stubs->addr + STUB_BUF_SIZE / 2;
    long disp;
    bool use_quirk_stub = false;

    if ( !ctxt->io_emul_stub )
        ctxt->io_emul_stub =
            map_domain_page(_mfn(this_stubs->mfn)) + (stub_va & ~PAGE_MASK);

    /* call host_to_guest_gpr_switch */
    ctxt->io_emul_stub[0] = 0xe8;
    disp = (long)host_to_guest_gpr_switch - (stub_va + 5);
    BUG_ON((int32_t)disp != disp);
    *(int32_t *)&ctxt->io_emul_stub[1] = disp;

    if ( unlikely(ioemul_handle_quirk) )
        use_quirk_stub = ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[5],
                                             ctxt->ctxt.regs);

    if ( !use_quirk_stub )
    {
        /* data16 or nop */
        ctxt->io_emul_stub[5] = (bytes != 2) ? 0x90 : 0x66;
        /* <io-access opcode> */
        ctxt->io_emul_stub[6] = opcode;
        /* imm8 or nop */
        ctxt->io_emul_stub[7] = !(opcode & 8) ? port : 0x90;
        /* ret (jumps to guest_to_host_gpr_switch) */
        ctxt->io_emul_stub[8] = 0xc3;
    }

    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < MAX(9, /* Default emul stub */
                                         5 + IOEMUL_QUIRK_STUB_BYTES));

    /* Handy function-typed pointer to the stub. */
    return (void *)stub_va;
}

/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
static bool iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
{
    unsigned int cpl = guest_kernel_mode(v, regs) ?
        (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3;

    ASSERT((v->arch.pv.iopl & ~X86_EFLAGS_IOPL) == 0);

    return IOPL(cpl) <= v->arch.pv.iopl;
}

/* Has the guest requested sufficient permission for this I/O access? */
static bool guest_io_okay(unsigned int port, unsigned int bytes,
                          struct vcpu *v, struct cpu_user_regs *regs)
{
    /* If in user mode, switch to kernel mode just to read I/O bitmap. */
    const bool user_mode = !(v->arch.flags & TF_kernel_mode);

    if ( iopl_ok(v, regs) )
        return true;

    if ( (port + bytes) <= v->arch.pv.iobmp_limit )
    {
        union { uint8_t bytes[2]; uint16_t mask; } x;

        /*
         * Grab permission bytes from guest space. Inaccessible bytes are
         * read as 0xff (no access allowed).
         */
        if ( user_mode )
            toggle_guest_pt(v);

        switch ( __copy_from_guest_offset(x.bytes, v->arch.pv.iobmp,
                                          port>>3, 2) )
        {
        default: x.bytes[0] = ~0;
            /* fallthrough */
        case 1:  x.bytes[1] = ~0;
            /* fallthrough */
        case 0:  break;
        }

        if ( user_mode )
            toggle_guest_pt(v);

        if ( (x.mask & (((1 << bytes) - 1) << (port & 7))) == 0 )
            return true;
    }

    return false;
}

/* Has the administrator granted sufficient permission for this I/O access? */
static bool admin_io_okay(unsigned int port, unsigned int bytes,
                          const struct domain *d)
{
    /*
     * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
     * We never permit direct access to that register.
     */
    if ( (port == 0xcf8) && (bytes == 4) )
        return false;

    /* We also never permit direct access to the RTC/CMOS registers. */
    if ( ((port & ~1) == RTC_PORT(0)) )
        return false;

    return ioports_access_permitted(d, port, port + bytes - 1);
}
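
/*
 * Decide whether a hardware domain access through the 0xcf8/0xcfc config
 * space mechanism may be forwarded to the device: the BDF must not be in
 * the read-only map for writes, AMD extended config space cycles are only
 * honoured when enabled in NB_CFG, and the final say is with XSM (reads)
 * or pci_conf_write_intercept() (writes).
 */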
static bool pci_cfg_ok(struct domain *currd, unsigned int start,
                       unsigned int size, uint32_t *write)
{
    uint32_t machine_bdf;

    if ( !is_hardware_domain(currd) )
        return false;

    if ( !CF8_ENABLED(currd->arch.pci_cf8) )
        return true;

    machine_bdf = CF8_BDF(currd->arch.pci_cf8);
    if ( write )
    {
        const unsigned long *ro_map = pci_get_ro_map(0);

        if ( ro_map && test_bit(machine_bdf, ro_map) )
            return false;
    }
    start |= CF8_ADDR_LO(currd->arch.pci_cf8);
    /* AMD extended configuration space access? */
    if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
         boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
         boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
    {
        uint64_t msr_val;

        if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
            return false;
        if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
            start |= CF8_ADDR_HI(currd->arch.pci_cf8);
    }

    return !write ?
           xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
                                     start, start + size - 1, 0) == 0 :
           pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
}
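
/*
 * Read an I/O port on behalf of the guest. Ports the domain has been granted
 * full access to are read directly; otherwise the PIT, RTC/CMOS index and
 * data, and PCI config space ports are emulated, and anything else reads as
 * all-ones, one byte at a time.
 */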
static uint32_t guest_io_read(unsigned int port, unsigned int bytes,
                              struct domain *currd)
{
    uint32_t data = 0;
    unsigned int shift = 0;

    if ( admin_io_okay(port, bytes, currd) )
    {
        switch ( bytes )
        {
        case 1: return inb(port);
        case 2: return inw(port);
        case 4: return inl(port);
        }
    }

    while ( bytes != 0 )
    {
        unsigned int size = 1;
        uint32_t sub_data = ~0;

        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
        {
            sub_data = pv_pit_handler(port, 0, 0);
        }
        else if ( port == RTC_PORT(0) )
        {
            sub_data = currd->arch.cmos_idx;
        }
        else if ( (port == RTC_PORT(1)) &&
                  ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
        {
            unsigned long flags;

            spin_lock_irqsave(&rtc_lock, flags);
            outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
            sub_data = inb(RTC_PORT(1));
            spin_unlock_irqrestore(&rtc_lock, flags);
        }
        else if ( (port == 0xcf8) && (bytes == 4) )
        {
            size = 4;
            sub_data = currd->arch.pci_cf8;
        }
        else if ( (port & 0xfffc) == 0xcfc )
        {
            size = min(bytes, 4 - (port & 3));
            if ( size == 3 )
                size = 2;
            if ( pci_cfg_ok(currd, port & 3, size, NULL) )
                sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
        }

        if ( size == 4 )
            return sub_data;

        data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
        shift += size * 8;
        port += size;
        bytes -= size;
    }

    return data;
}
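
/*
 * Check the emulated DR7 for enabled I/O breakpoints covering this port
 * range. The DR7 length field is decoded into a width of 1, 2, 4 or 8
 * bytes, and a breakpoint matches when [dr[i], dr[i] + width) overlaps
 * [port, port + len).
 */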
static unsigned int check_guest_io_breakpoint(struct vcpu *v,
                                              unsigned int port,
                                              unsigned int len)
{
    unsigned int width, i, match = 0;
    unsigned long start;

    if ( !v->arch.pv.dr7_emul || !(v->arch.pv.ctrlreg[4] & X86_CR4_DE) )
        return 0;

    for ( i = 0; i < 4; i++ )
    {
        if ( !(v->arch.pv.dr7_emul & (3 << (i * DR_ENABLE_SIZE))) )
            continue;

        start = v->arch.dr[i];
        width = 0;

        switch ( (v->arch.dr7 >>
                  (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
        {
        case DR_LEN_1: width = 1; break;
        case DR_LEN_2: width = 2; break;
        case DR_LEN_4: width = 4; break;
        case DR_LEN_8: width = 8; break;
        }

        if ( (start < (port + len)) && ((start + width) > port) )
            match |= 1u << i;
    }

    return match;
}
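
/*
 * x86_emulate() hooks for single IN/OUT instructions. Port permission and
 * I/O breakpoint checks are done here; ports with full access granted are
 * accessed directly via the per-CPU stub, everything else goes through
 * guest_io_read()/guest_io_write().
 */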
static int read_io(unsigned int port, unsigned int bytes,
                   unsigned long *val, struct x86_emulate_ctxt *ctxt)
{
    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
    struct vcpu *curr = current;
    struct domain *currd = current->domain;

    /* INS must not come here. */
    ASSERT((ctxt->opcode & ~9) == 0xe4);

    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
        return X86EMUL_UNHANDLEABLE;

    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);

    if ( admin_io_okay(port, bytes, currd) )
    {
        io_emul_stub_t *io_emul =
            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);

        io_emul(ctxt->regs);
        return X86EMUL_DONE;
    }

    *val = guest_io_read(port, bytes, currd);

    return X86EMUL_OKAY;
}
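
/*
 * Write an I/O port on behalf of the guest: the counterpart to
 * guest_io_read(), forwarding fully permitted ports directly and emulating
 * the PIT, RTC/CMOS and PCI config space ports.
 */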
static void guest_io_write(unsigned int port, unsigned int bytes,
                           uint32_t data, struct domain *currd)
{
    if ( admin_io_okay(port, bytes, currd) )
    {
        switch ( bytes )
        {
        case 1:
            outb((uint8_t)data, port);
            if ( amd_acpi_c1e_quirk )
                amd_check_disable_c1e(port, (uint8_t)data);
            break;
        case 2:
            outw((uint16_t)data, port);
            break;
        case 4:
            outl(data, port);
            break;
        }
        return;
    }

    while ( bytes != 0 )
    {
        unsigned int size = 1;

        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
        {
            pv_pit_handler(port, (uint8_t)data, 1);
        }
        else if ( port == RTC_PORT(0) )
        {
            currd->arch.cmos_idx = data;
        }
        else if ( (port == RTC_PORT(1)) &&
                  ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
        {
            unsigned long flags;

            if ( pv_rtc_handler )
                pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data);
            spin_lock_irqsave(&rtc_lock, flags);
            outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
            outb(data, RTC_PORT(1));
            spin_unlock_irqrestore(&rtc_lock, flags);
        }
        else if ( (port == 0xcf8) && (bytes == 4) )
        {
            size = 4;
            currd->arch.pci_cf8 = data;
        }
        else if ( (port & 0xfffc) == 0xcfc )
        {
            size = min(bytes, 4 - (port & 3));
            if ( size == 3 )
                size = 2;
            if ( pci_cfg_ok(currd, port & 3, size, &data) )
                pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
        }

        if ( size == 4 )
            return;

        port += size;
        bytes -= size;
        data >>= size * 8;
    }
}

static int write_io(unsigned int port, unsigned int bytes,
                    unsigned long val, struct x86_emulate_ctxt *ctxt)
{
    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
    struct vcpu *curr = current;
    struct domain *currd = current->domain;

    /* OUTS must not come here. */
    ASSERT((ctxt->opcode & ~9) == 0xe6);

    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
        return X86EMUL_UNHANDLEABLE;

    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);

    if ( admin_io_okay(port, bytes, currd) )
    {
        io_emul_stub_t *io_emul =
            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);

        io_emul(ctxt->regs);
        if ( (bytes == 1) && amd_acpi_c1e_quirk )
            amd_check_disable_c1e(port, val);
        return X86EMUL_DONE;
    }

    guest_io_write(port, bytes, val, currd);

    return X86EMUL_OKAY;
}
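
/*
 * read_segment() hook: for 32-bit (and 16-bit) address sizes the selector is
 * looked up via pv_emul_read_descriptor(); in 64-bit mode flat segments are
 * synthesised, with only FS/GS taking their bases from the MSR-backed state.
 * Accesses to TR are only used for the I/O bitmap check, which is deferred
 * to the read_io()/write_io() hooks.
 */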
static int read_segment(enum x86_segment seg,
                        struct segment_register *reg,
                        struct x86_emulate_ctxt *ctxt)
{
    /* Check if this is an attempt to access the I/O bitmap. */
    if ( seg == x86_seg_tr )
    {
        switch ( ctxt->opcode )
        {
        case 0x6c ... 0x6f: /* ins / outs */
        case 0xe4 ... 0xe7: /* in / out (immediate port) */
        case 0xec ... 0xef: /* in / out (port in %dx) */
            /* Defer the check to priv_op_{read,write}_io(). */
            return X86EMUL_DONE;
        }
    }

    if ( ctxt->addr_size < 64 )
    {
        unsigned int sel, ar;

        switch ( seg )
        {
        case x86_seg_cs: sel = ctxt->regs->cs; break;
        case x86_seg_ds: sel = read_sreg(ds); break;
        case x86_seg_es: sel = read_sreg(es); break;
        case x86_seg_fs: sel = read_sreg(fs); break;
        case x86_seg_gs: sel = read_sreg(gs); break;
        case x86_seg_ss: sel = ctxt->regs->ss; break;
        default: return X86EMUL_UNHANDLEABLE;
        }

        if ( !pv_emul_read_descriptor(sel, current, &reg->base,
            return X86EMUL_UNHANDLEABLE;
    }
    else
    {
        switch ( seg )
        {
        default:
            if ( !is_x86_user_segment(seg) )
                return X86EMUL_UNHANDLEABLE;
            break;
        case x86_seg_fs:
            reg->base = rdfsbase();
            break;
        case x86_seg_gs:
            reg->base = rdgsbase();
            break;
        }

        reg->type = _SEGMENT_WR >> 8;
        if ( seg == x86_seg_cs )
            reg->type |= _SEGMENT_CODE >> 8;
    }

    /*
     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
     * Also do this for consistency for non-conforming code segments.
     */
    if ( (seg == x86_seg_ss ||
          (seg == x86_seg_cs &&
           !(reg->type & (_SEGMENT_EC >> 8)))) &&
         guest_kernel_mode(current, ctxt->regs) )
        reg->dpl = 0;

    return X86EMUL_OKAY;
}
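
/*
 * Turn a segment:offset access into a linear address, raising #GP (or #SS
 * for stack segment accesses) if the offset is outside the segment limit
 * or the resulting address is non-canonical.
 */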
static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
                                  unsigned int bytes, unsigned long limit,
                                  enum x86_segment seg,
                                  struct x86_emulate_ctxt *ctxt,
                                  unsigned long *addr)
{
    int rc = X86EMUL_OKAY;

    *addr = base + offset;

    if ( ctxt->addr_size < 64 )
    {
        if ( limit < bytes - 1 || offset > limit - bytes + 1 )
            rc = X86EMUL_EXCEPTION;
        *addr = (uint32_t)*addr;
    }
    else if ( !__addr_ok(*addr) )
        rc = X86EMUL_EXCEPTION;

    if ( unlikely(rc == X86EMUL_EXCEPTION) )
        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
                                                : TRAP_stack_error,
                              0, ctxt);

    return rc;
}
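
/*
 * REP INS/OUTS emulation. The descriptor checks mirror what hardware does
 * for string I/O, each iteration goes through guest_io_read()/
 * guest_io_write(), and the loop is exited early on an I/O breakpoint match
 * or pending preemption so that x86_emulate() can restart the instruction
 * with the updated count.
 */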
static int rep_ins(uint16_t port,
                   enum x86_segment seg, unsigned long offset,
                   unsigned int bytes_per_rep, unsigned long *reps,
                   struct x86_emulate_ctxt *ctxt)
{
    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
    struct vcpu *curr = current;
    struct domain *currd = current->domain;
    unsigned long goal = *reps;
    struct segment_register sreg;
    int rc;

    ASSERT(seg == x86_seg_es);

    *reps = 0;

    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
        return X86EMUL_UNHANDLEABLE;

    rc = read_segment(x86_seg_es, &sreg, ctxt);
    if ( rc != X86EMUL_OKAY )
        return rc;

    if ( !sreg.p )
        return X86EMUL_UNHANDLEABLE;
    if ( !sreg.s ||
         (sreg.type & (_SEGMENT_CODE >> 8)) ||
         !(sreg.type & (_SEGMENT_WR >> 8)) )
    {
        x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
        return X86EMUL_EXCEPTION;
    }

    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);

    while ( *reps < goal )
    {
        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
        unsigned long addr;

        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
                                    sreg.limit, x86_seg_es, ctxt, &addr);
        if ( rc != X86EMUL_OKAY )
            return rc;

        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
        {
            x86_emul_pagefault(PFEC_write_access,
                               addr + bytes_per_rep - rc, ctxt);
            return X86EMUL_EXCEPTION;
        }

        ++*reps;

        if ( poc->bpmatch || hypercall_preempt_check() )
            break;

        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
        if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
            offset -= bytes_per_rep;
        else
            offset += bytes_per_rep;
    }

    return X86EMUL_OKAY;
}

static int rep_outs(enum x86_segment seg, unsigned long offset,
                    uint16_t port,
                    unsigned int bytes_per_rep, unsigned long *reps,
                    struct x86_emulate_ctxt *ctxt)
{
    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
    struct vcpu *curr = current;
    struct domain *currd = current->domain;
    unsigned long goal = *reps;
    struct segment_register sreg;
    int rc;

    *reps = 0;

    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
        return X86EMUL_UNHANDLEABLE;

    rc = read_segment(seg, &sreg, ctxt);
    if ( rc != X86EMUL_OKAY )
        return rc;

    if ( !sreg.p )
        return X86EMUL_UNHANDLEABLE;
    if ( !sreg.s ||
         ((sreg.type & (_SEGMENT_CODE >> 8)) &&
          !(sreg.type & (_SEGMENT_WR >> 8))) )
    {
        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
                                                : TRAP_stack_error,
                              0, ctxt);
        return X86EMUL_EXCEPTION;
    }

    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);

    while ( *reps < goal )
    {
        unsigned int data = 0;
        unsigned long addr;

        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
                                    sreg.limit, seg, ctxt, &addr);
        if ( rc != X86EMUL_OKAY )
            return rc;

        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
        {
            x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
            return X86EMUL_EXCEPTION;
        }

        guest_io_write(port, bytes_per_rep, data, currd);

        ++*reps;

        if ( poc->bpmatch || hypercall_preempt_check() )
            break;

        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
        if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
            offset -= bytes_per_rep;
        else
            offset += bytes_per_rep;
    }

    return X86EMUL_OKAY;
}
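
/*
 * CR reads return the guest's shadowed control register state; CR3 is
 * translated back from the current page table MFN into the guest-visible
 * (compat or native) CR3 format.
 */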
static int read_cr(unsigned int reg, unsigned long *val,
                   struct x86_emulate_ctxt *ctxt)
{
    const struct vcpu *curr = current;

    switch ( reg )
    {
    case 0: /* Read CR0 */
        *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv.ctrlreg[0];
        return X86EMUL_OKAY;

    case 2: /* Read CR2 */
    case 4: /* Read CR4 */
        *val = curr->arch.pv.ctrlreg[reg];
        return X86EMUL_OKAY;

    case 3: /* Read CR3 */
    {
        const struct domain *currd = curr->domain;
        mfn_t mfn;

        if ( !is_pv_32bit_domain(currd) )
        {
            mfn = pagetable_get_mfn(curr->arch.guest_table);
            *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn_x(mfn)));
        }
        else
        {
            l4_pgentry_t *pl4e =
                map_domain_page(pagetable_get_mfn(curr->arch.guest_table));

            mfn = l4e_get_mfn(*pl4e);
            unmap_domain_page(pl4e);
            *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn_x(mfn)));
        }
        /* PTs should not be shared */
        BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
        return X86EMUL_OKAY;
    }
    }

    return X86EMUL_UNHANDLEABLE;
}
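
/*
 * CR writes: CR0 only honours TS changes, CR2 is purely shadowed, CR3 goes
 * through new_guest_cr3() (and may be preempted), and CR4 is filtered by
 * pv_fixup_guest_cr4() before being loaded.
 */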
static int write_cr(unsigned int reg, unsigned long val,
                    struct x86_emulate_ctxt *ctxt)
{
    struct vcpu *curr = current;

    switch ( reg )
    {
    case 0: /* Write CR0 */
        if ( (val ^ read_cr0()) & ~X86_CR0_TS )
        {
            gdprintk(XENLOG_WARNING,
                     "Attempt to change unmodifiable CR0 flags\n");
            break;
        }
        do_fpu_taskswitch(!!(val & X86_CR0_TS));
        return X86EMUL_OKAY;

    case 2: /* Write CR2 */
        curr->arch.pv.ctrlreg[2] = val;
        arch_set_cr2(curr, val);
        return X86EMUL_OKAY;

    case 3: /* Write CR3 */
    {
        struct domain *currd = curr->domain;
        unsigned long gfn;
        struct page_info *page;
        int rc;

        gfn = !is_pv_32bit_domain(currd)
              ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
        page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
        if ( !page )
            break;
        rc = new_guest_cr3(page_to_mfn(page));
        put_page(page);

        switch ( rc )
        {
        case 0:
            return X86EMUL_OKAY;
        case -ERESTART: /* retry after preemption */
            return X86EMUL_RETRY;
        }
        break;
    }

    case 4: /* Write CR4 */
        /*
         * If this write will disable FSGSBASE, refresh Xen's idea of the
         * guest bases now that they can no longer change.
         */
        if ( (curr->arch.pv.ctrlreg[4] & X86_CR4_FSGSBASE) &&
             !(val & X86_CR4_FSGSBASE) )
        {
            curr->arch.pv.fs_base = __rdfsbase();
            curr->arch.pv.gs_base_kernel = __rdgsbase();
        }

        curr->arch.pv.ctrlreg[4] = pv_fixup_guest_cr4(curr, val);
        write_cr4(pv_make_cr4(curr));
        ctxt_switch_levelling(curr);
        return X86EMUL_OKAY;
    }

    return X86EMUL_UNHANDLEABLE;
}

static inline uint64_t guest_misc_enable(uint64_t val)
{
    val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
             MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
    val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
           MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
           MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
    return val;
}

static inline bool is_cpufreq_controller(const struct domain *d)
{
    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
            is_hardware_domain(d));
}
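
/*
 * RDMSR emulation. guest_rdmsr() gets the first chance; what it leaves
 * unhandled is dealt with here, with reads of unknown MSRs falling back to
 * a (possibly faulting) rdmsr_safe() of the real register.
 */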
static int read_msr(unsigned int reg, uint64_t *val,
                    struct x86_emulate_ctxt *ctxt)
{
    const struct vcpu *curr = current;
    const struct domain *currd = curr->domain;
    bool vpmu_msr = false;

    if ( (ret = guest_rdmsr(curr, reg, val)) != X86EMUL_UNHANDLEABLE )
        if ( ret == X86EMUL_EXCEPTION )
            x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);

        if ( is_pv_32bit_domain(currd) )
        *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdfsbase()
                                               : curr->arch.pv.fs_base;

        if ( is_pv_32bit_domain(currd) )
        *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdgsbase()
                                               : curr->arch.pv.gs_base_kernel;

    case MSR_SHADOW_GS_BASE:
        if ( is_pv_32bit_domain(currd) )
        *val = curr->arch.pv.gs_base_user;

        *val = currd->arch.vtsc ? pv_soft_rdtsc(curr, ctxt->regs) : rdtsc();

        /* Hide unknown bits, and unconditionally hide SVME from guests. */
        *val = read_efer() & EFER_KNOWN_MASK & ~EFER_SVME;
        /*
         * Hide the 64-bit features from 32-bit guests. SCE has
         * vendor-dependent behaviour.
         */
        if ( is_pv_32bit_domain(currd) )
            *val &= ~(EFER_LME | EFER_LMA |
                      (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL

    case MSR_K7_FID_VID_CTL:
    case MSR_K7_FID_VID_STATUS:
    case MSR_K8_PSTATE_LIMIT:
    case MSR_K8_PSTATE_CTRL:
    case MSR_K8_PSTATE_STATUS:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
        if ( unlikely(is_cpufreq_controller(currd)) )

    case MSR_IA32_UCODE_REV:
        BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
            if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
            /* As documented in the SDM: Do a CPUID 1 here */

    case MSR_IA32_MISC_ENABLE:
        *val = guest_misc_enable(*val);

    case MSR_IA32_PERF_CAPABILITIES:
        /* No extra capabilities are supported. */

    case MSR_P6_PERFCTR(0) ... MSR_P6_PERFCTR(7):
    case MSR_P6_EVNTSEL(0) ... MSR_P6_EVNTSEL(3):
    case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR2:
    case MSR_CORE_PERF_FIXED_CTR_CTRL ... MSR_CORE_PERF_GLOBAL_OVF_CTRL:
        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )

    case MSR_AMD_FAM15H_EVNTSEL0 ... MSR_AMD_FAM15H_PERFCTR5:
    case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
                if ( vpmu_do_rdmsr(reg, val) )

        rc = vmce_rdmsr(reg, val);

        /* Everyone can read the MSR space. */
        /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
        if ( rdmsr_safe(reg, *val) )

    return X86EMUL_UNHANDLEABLE;
}
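
/*
 * WRMSR emulation. guest_wrmsr() gets the first chance; the cases below
 * whitelist what a PV guest (mostly the hardware domain) may still write
 * directly, and anything else is only accepted if it would not change the
 * current value.
 */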
static int write_msr(unsigned int reg, uint64_t val,
                     struct x86_emulate_ctxt *ctxt)
{
    struct vcpu *curr = current;
    const struct domain *currd = curr->domain;
    bool vpmu_msr = false;

    if ( (ret = guest_wrmsr(curr, reg, val)) != X86EMUL_UNHANDLEABLE )
        if ( ret == X86EMUL_EXCEPTION )
            x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);

        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
        curr->arch.pv.fs_base = val;

        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
        curr->arch.pv.gs_base_kernel = val;

    case MSR_SHADOW_GS_BASE:
        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
        curr->arch.pv.gs_base_user = val;

    case MSR_K7_FID_VID_STATUS:
    case MSR_K7_FID_VID_CTL:
    case MSR_K8_PSTATE_LIMIT:
    case MSR_K8_PSTATE_CTRL:
    case MSR_K8_PSTATE_STATUS:
    case MSR_K8_PSTATE0:
    case MSR_K8_PSTATE1:
    case MSR_K8_PSTATE2:
    case MSR_K8_PSTATE3:
    case MSR_K8_PSTATE4:
    case MSR_K8_PSTATE5:
    case MSR_K8_PSTATE6:
    case MSR_K8_PSTATE7:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
        if ( likely(!is_cpufreq_controller(currd)) ||
             wrmsr_safe(reg, val) == 0 )
            return X86EMUL_OKAY;

    case MSR_AMD64_NB_CFG:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
        if ( !is_hwdom_pinned_vcpu(curr) )
            return X86EMUL_OKAY;
        if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
             ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
        if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
            return X86EMUL_OKAY;

    case MSR_FAM10H_MMIO_CONF_BASE:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
        if ( !is_hwdom_pinned_vcpu(curr) )
            return X86EMUL_OKAY;
        if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
        if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
              ~(FAM10H_MMIO_CONF_ENABLE |
                (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
                 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
                ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
                 FAM10H_MMIO_CONF_BASE_SHIFT))) )
        if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
            return X86EMUL_OKAY;

    case MSR_IA32_UCODE_REV:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
        if ( !is_hwdom_pinned_vcpu(curr) )
            return X86EMUL_OKAY;
        if ( rdmsr_safe(reg, temp) )
        return X86EMUL_OKAY;

    case MSR_IA32_MISC_ENABLE:
        if ( val != guest_misc_enable(temp) )
        return X86EMUL_OKAY;

    case MSR_IA32_MPERF:
    case MSR_IA32_APERF:
        if ( !(boot_cpu_data.x86_vendor & (X86_VENDOR_INTEL | X86_VENDOR_AMD)) )
        if ( likely(!is_cpufreq_controller(currd)) ||
             wrmsr_safe(reg, val) == 0 )
            return X86EMUL_OKAY;

    case MSR_IA32_PERF_CTL:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
        if ( likely(!is_cpufreq_controller(currd)) ||
             wrmsr_safe(reg, val) == 0 )
            return X86EMUL_OKAY;

    case MSR_IA32_THERM_CONTROL:
    case MSR_IA32_ENERGY_PERF_BIAS:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
        if ( !is_hwdom_pinned_vcpu(curr) || wrmsr_safe(reg, val) == 0 )
            return X86EMUL_OKAY;

    case MSR_P6_PERFCTR(0) ... MSR_P6_PERFCTR(7):
    case MSR_P6_EVNTSEL(0) ... MSR_P6_EVNTSEL(3):
    case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR2:
    case MSR_CORE_PERF_FIXED_CTR_CTRL ... MSR_CORE_PERF_GLOBAL_OVF_CTRL:
        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )

    case MSR_AMD_FAM15H_EVNTSEL0 ... MSR_AMD_FAM15H_PERFCTR5:
    case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
                if ( (vpmu_mode & XENPMU_MODE_ALL) &&
                     !is_hardware_domain(currd) )
                    return X86EMUL_OKAY;

                if ( vpmu_do_wrmsr(reg, val, 0) )
                return X86EMUL_OKAY;

        rc = vmce_wrmsr(reg, val);
            return X86EMUL_OKAY;

        if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
            gdprintk(XENLOG_WARNING,
                     "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
        return X86EMUL_OKAY;

    return X86EMUL_UNHANDLEABLE;
}

/* Name it differently to avoid clashing with wbinvd() */
static int _wbinvd(struct x86_emulate_ctxt *ctxt)
{
    /* Ignore the instruction if unprivileged. */
    if ( !cache_flush_permitted(current->domain) )
        /*
         * Non-physdev domain attempted WBINVD; ignore for now since
         * newer linux uses this in some start-of-day timing loops.
         */
        ;
    else
        wbinvd();

    return X86EMUL_OKAY;
}

int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf,
                  struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt)
{
    guest_cpuid(current, leaf, subleaf, res);

    return X86EMUL_OKAY;
}
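
/*
 * Accept only the instructions this file knows how to emulate: port I/O,
 * CLTS/WBINVD, CR/DR and MSR accesses, RDTSC(P), CPUID, XSETBV, and
 * CLI/STI (which are handled right here by updating the virtual interrupt
 * mask); anything else is rejected as unhandleable.
 */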
static int validate(const struct x86_emulate_state *state,
                    struct x86_emulate_ctxt *ctxt)
{
    switch ( ctxt->opcode )
    {
    case 0x6c ... 0x6f: /* ins / outs */
    case 0xe4 ... 0xe7: /* in / out (immediate port) */
    case 0xec ... 0xef: /* in / out (port in %dx) */
    case X86EMUL_OPC(0x0f, 0x06): /* clts */
    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
    case X86EMUL_OPC(0x0f, 0x20) ...
         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
        return X86EMUL_OKAY;

    case 0xfa: case 0xfb: /* cli / sti */
        if ( !iopl_ok(current, ctxt->regs) )
            break;
        /*
         * This is just too dangerous to allow, in my opinion. Consider if the
         * caller then tries to reenable interrupts using POPF: we can't trap
         * that and we'll end up with hard-to-debug lockups. Fast & loose will
         */
        vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);

        return X86EMUL_DONE;

    case X86EMUL_OPC(0x0f, 0x01):
    {
        unsigned int modrm_rm, modrm_reg;

        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
             (modrm_rm & 7) != 1 )
            break;
        switch ( modrm_reg & 7 )
        {
        case 2: /* xsetbv */
        case 7: /* rdtscp */
            return X86EMUL_OKAY;
        }
        break;
    }
    }

    return X86EMUL_UNHANDLEABLE;
}
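
/*
 * Fetch instruction bytes through the guest CS mapping set up in
 * pv_emulate_privileged_op(). Branches are not meant to be emulated, and
 * faults during the fetch are reported back to the guest as page faults.
 */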
static int insn_fetch(enum x86_segment seg,
                      unsigned long offset,
                      void *p_data,
                      unsigned int bytes,
                      struct x86_emulate_ctxt *ctxt)
{
    const struct priv_op_ctxt *poc =
        container_of(ctxt, struct priv_op_ctxt, ctxt);
    unsigned int rc;
    unsigned long addr = poc->cs.base + offset;

    ASSERT(seg == x86_seg_cs);

    /* We don't mean to emulate any branches. */
    if ( !bytes )
        return X86EMUL_UNHANDLEABLE;

    rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
                                x86_seg_cs, ctxt, &addr);
    if ( rc != X86EMUL_OKAY )
        return rc;

    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
    {
        /*
         * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
         * cpu_has_nx, but we'd then need a "fetch" variant of
         * __copy_from_user() respecting NX, SMEP, and protection keys.
         */
        x86_emul_pagefault(0, addr + bytes - rc, ctxt);
        return X86EMUL_EXCEPTION;
    }

    return X86EMUL_OKAY;
}
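
/*
 * Hook table handed to x86_emulate(). Memory reads other than instruction
 * fetches are rejected, so only the privileged operations validated above
 * ever reach the individual handlers.
 */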
static const struct x86_emulate_ops priv_op_ops = {
    .insn_fetch = insn_fetch,
    .read = x86emul_unhandleable_rw,
    .validate = validate,
    .read_io = read_io,
    .write_io = write_io,
    .rep_ins = rep_ins,
    .rep_outs = rep_outs,
    .read_segment = read_segment,
    .read_cr = read_cr,
    .write_cr = write_cr,
    .read_dr = x86emul_read_dr,
    .write_dr = x86emul_write_dr,
    .write_xcr = x86emul_write_xcr,
    .read_msr = read_msr,
    .write_msr = write_msr,
    .cpuid = pv_emul_cpuid,
    .wbinvd = _wbinvd,
};
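
/*
 * Emulate a faulting privileged instruction on behalf of the guest kernel:
 * decode and run it through x86_emulate() with the hooks above, while
 * mirroring the vcpu's virtual IF/IOPL into the register image for the
 * duration of the emulation.
 */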
int pv_emulate_privileged_op(struct cpu_user_regs *regs)
{
    struct vcpu *curr = current;
    struct domain *currd = curr->domain;
    struct priv_op_ctxt ctxt = {
        .ctxt.regs = regs,
        .ctxt.vendor = currd->arch.cpuid->x86_vendor,
        .ctxt.lma = !is_pv_32bit_domain(currd),
    };
    int rc;
    unsigned int eflags, ar;

    if ( !pv_emul_read_descriptor(regs->cs, curr, &ctxt.cs.base,
                                  &ctxt.cs.limit, &ar, 1) ||
         !(ar & _SEGMENT_S) ||
         !(ar & _SEGMENT_P) ||
         !(ar & _SEGMENT_CODE) )
        return 0;

    /* Mirror virtualized state into EFLAGS. */
    ASSERT(regs->eflags & X86_EFLAGS_IF);
    if ( vcpu_info(curr, evtchn_upcall_mask) )
        regs->eflags &= ~X86_EFLAGS_IF;
    else
        regs->eflags |= X86_EFLAGS_IF;
    ASSERT(!(regs->eflags & X86_EFLAGS_IOPL));
    regs->eflags |= curr->arch.pv.iopl;
    eflags = regs->eflags;

    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);

    if ( ctxt.io_emul_stub )
        unmap_domain_page(ctxt.io_emul_stub);

    /*
     * Un-mirror virtualized state from EFLAGS.
     * Nothing we allow to be emulated can change anything other than the
     * arithmetic bits, and the resume flag.
     */
    ASSERT(!((regs->eflags ^ eflags) &
             ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK)));
    regs->eflags |= X86_EFLAGS_IF;
    regs->eflags &= ~X86_EFLAGS_IOPL;

    switch ( rc )
    {
    case X86EMUL_OKAY:
        if ( ctxt.ctxt.retire.singlestep )
            ctxt.bpmatch |= DR_STEP;
        if ( ctxt.bpmatch )
        {
            curr->arch.dr6 |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
            if ( !(curr->arch.pv.trap_bounce.flags & TBF_EXCEPTION) )
                pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
        }
        /* fall through */
    case X86EMUL_RETRY:
        return EXCRET_fault_fixed;

    case X86EMUL_EXCEPTION:
        pv_inject_event(&ctxt.ctxt.event);
        return EXCRET_fault_fixed;
    }

    return 0;
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */